# IPython shell magic: move two levels up to the repository root so that
# relative artifact paths (e.g. artifacts/models/experiments/...) resolve.
cd ../..
/Users/shanekercheval/repos/data-science-template
# Identifier of the experiment run whose artifacts this notebook analyzes.
timestamp = '2022_06_05_12_33_24'
# IPython magic: execute the shared notebook settings script
# (presumably common imports/display options — confirm in source/config).
%run "source/config/notebook_settings.py"
from source.library.utilities import Timer, log_info, get_config
config = get_config()
# e.g. artifacts/models/experiments/experiment__2022_06_05_12_33_24 (see log below)
experiment_directory = os.path.join(config['EXPERIMENTS']['DIRECTORY'], f"experiment__{timestamp}")
log_info(f"Experiment Directory: {experiment_directory}")
def file_path(file_name: str) -> str:
    """Return the path of `file_name` inside the experiment directory."""
    return os.path.join(experiment_directory, file_name)
2022-06-05 12:33:48 - INFO | Experiment Directory: artifacts/models/experiments/experiment__2022_06_05_12_33_24
# Load the train/test splits persisted by the experiment run.
# Fix: `pd.pandas.read_pickle` was an accidental double attribute access;
# the public API is `pd.read_pickle` (consistent with its use further below).
with Timer("Loading training/test datasets"):
    X_train = pd.read_pickle(file_path('x_train.pkl'))
    X_test = pd.read_pickle(file_path('x_test.pkl'))
    y_train = pd.read_pickle(file_path('y_train.pkl'))
    y_test = pd.read_pickle(file_path('y_test.pkl'))
2022-06-05 12:33:48 - INFO | *****Timer Started: Loading training/test datasets 2022-06-05 12:33:48 - INFO | *****Timer Finished (0.00 seconds)
# Sanity-check the split sizes: feature shapes and label counts.
for split_summary in (X_train.shape, len(y_train), X_test.shape, len(y_test)):
    log_info(split_summary)
2022-06-05 12:33:48 - INFO | (800, 20) 2022-06-05 12:33:48 - INFO | 800 2022-06-05 12:33:48 - INFO | (200, 20) 2022-06-05 12:33:48 - INFO | 200
# Distinct target labels and their absolute counts in the training set.
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Training-set class proportions.
# Fix: the original called `np.unique(y_train, return_counts=True)` twice on the
# same data; compute the counts once and normalize them.
train_counts = np.unique(y_train, return_counts=True)[1]
train_counts / train_counts.sum()
array([0.69875, 0.30125])
# Test-set class proportions.
# Fix: the original called `np.unique(y_test, return_counts=True)` twice on the
# same data; compute the counts once and normalize them.
test_counts = np.unique(y_test, return_counts=True)[1]
test_counts / test_counts.sum()
array([0.705, 0.295])
# Load the hyper-parameter-search results persisted by the experiment run.
experiment_yaml = file_path('experiment.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name=experiment_yaml)
log_info(f"Best Score: {results.best_score}")
2022-06-05 12:33:48 - INFO | Best Score: 0.7668027026011365
# Hyper-parameter combination of the search's top-scoring trial.
log_info(f"Best Params: {results.best_params}")
2022-06-05 12:33:48 - INFO | Best Params: {'model': 'RandomForestClassifier()', 'imputer': 'SimpleImputer()', 'scaler': 'None', 'pca': 'None', 'encoder': 'OneHotEncoder()'}
# Best trial from each model type: rank trials within each model family by
# mean cross-validated roc_auc and keep only the top-ranked row per family.
trials_df = results.to_formatted_dataframe(return_style=False, include_rank=True)
trials_df["model_rank"] = trials_df.groupby("model")["roc_auc Mean"].rank(method="first", ascending=False)
trials_df[trials_df["model_rank"] == 1]
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | learning_rate | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | scaler | pca | encoder | model_rank | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15 | 1 | 0.77 | 0.72 | 0.81 | RandomForestClassifier() | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | None | None | OneHotEncoder() | 1.00 |
| 11 | 2 | 0.77 | 0.71 | 0.82 | ExtraTreesClassifier() | NaN | 0.11 | 70.00 | 553.00 | 12.00 | 8.00 | 0.55 | entropy | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SimpleImputer(strategy='median') | None | PCA('mle') | OneHotEncoder() | 1.00 |
| 0 | 3 | 0.76 | 0.72 | 0.80 | LogisticRegression() | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SimpleImputer() | StandardScaler() | None | OneHotEncoder() | 1.00 |
| 7 | 4 | 0.76 | 0.72 | 0.80 | LinearSVC() | 0.28 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | PCA('mle') | OneHotEncoder() | 1.00 |
| 23 | 10 | 0.75 | 0.69 | 0.81 | XGBClassifier() | NaN | NaN | 2.00 | 1397.00 | NaN | NaN | NaN | NaN | 0.03 | 16.00 | 0.61 | 0.88 | 0.81 | 0.10 | 1.97 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() | 1.00 |
# Full search leaderboard (up to 1000 trials), styled for notebook display.
results.to_formatted_dataframe(return_style=True, include_rank=True, num_rows=1000)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | model | C | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | learning_rate | min_child_weight | subsample | colsample_bytree | colsample_bylevel | reg_alpha | reg_lambda | imputer | scaler | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.767 | 0.720 | 0.814 | RandomForestClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 2 | 0.766 | 0.707 | 0.825 | ExtraTreesClassifier() | <NA> | 0.114 | 70.000 | 553.000 | 12.000 | 8.000 | 0.548 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | OneHotEncoder() |
| 3 | 0.763 | 0.725 | 0.802 | LogisticRegression() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 4 | 0.761 | 0.720 | 0.803 | LinearSVC() | 0.281 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | PCA('mle') | OneHotEncoder() |
| 5 | 0.761 | 0.697 | 0.825 | LogisticRegression() | 0.001 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | MinMaxScaler() | None | OneHotEncoder() |
| 6 | 0.757 | 0.711 | 0.803 | ExtraTreesClassifier() | <NA> | 0.681 | 38.000 | 1,461.000 | 23.000 | 10.000 | 0.553 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | None | CustomOrdinalEncoder() |
| 7 | 0.756 | 0.711 | 0.802 | RandomForestClassifier() | <NA> | 0.685 | 30.000 | 1,659.000 | 25.000 | 11.000 | 0.781 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 8 | 0.753 | 0.716 | 0.791 | RandomForestClassifier() | <NA> | 0.303 | 81.000 | 1,063.000 | 15.000 | 27.000 | 0.502 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 9 | 0.752 | 0.698 | 0.805 | ExtraTreesClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 10 | 0.751 | 0.695 | 0.808 | XGBClassifier() | <NA> | <NA> | 2.000 | 1,397.000 | <NA> | <NA> | <NA> | <NA> | 0.031 | 16.000 | 0.608 | 0.881 | 0.812 | 0.096 | 1.971 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 11 | 0.751 | 0.713 | 0.789 | RandomForestClassifier() | <NA> | 0.328 | 5.000 | 1,047.000 | 23.000 | 43.000 | 0.957 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | OneHotEncoder() |
| 12 | 0.751 | 0.721 | 0.781 | LinearSVC() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 13 | 0.747 | 0.694 | 0.799 | ExtraTreesClassifier() | <NA> | 0.710 | 15.000 | 1,493.000 | 33.000 | 27.000 | 0.914 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | PCA('mle') | OneHotEncoder() |
| 14 | 0.746 | 0.716 | 0.776 | LogisticRegression() | 23.327 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | None | OneHotEncoder() |
| 15 | 0.745 | 0.704 | 0.786 | RandomForestClassifier() | <NA> | 0.762 | 88.000 | 1,235.000 | 8.000 | 7.000 | 0.666 | gini | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | None | PCA('mle') | CustomOrdinalEncoder() |
| 16 | 0.738 | 0.686 | 0.790 | XGBClassifier() | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | None | OneHotEncoder() |
| 17 | 0.736 | 0.695 | 0.777 | ExtraTreesClassifier() | <NA> | 0.740 | 14.000 | 1,645.000 | 5.000 | 43.000 | 0.741 | entropy | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | None | PCA('mle') | CustomOrdinalEncoder() |
| 18 | 0.730 | 0.702 | 0.758 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | None | CustomOrdinalEncoder() |
| 19 | 0.727 | 0.690 | 0.765 | LinearSVC() | 0.361 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | MinMaxScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 20 | 0.726 | 0.697 | 0.755 | LogisticRegression() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 21 | 0.723 | 0.676 | 0.771 | XGBClassifier() | <NA> | <NA> | 8.000 | 1,657.000 | <NA> | <NA> | <NA> | <NA> | 0.084 | 5.000 | 0.977 | 0.847 | 0.788 | 0.417 | 1.290 | SimpleImputer(strategy='median') | None | None | OneHotEncoder() |
| 22 | 0.717 | 0.676 | 0.758 | XGBClassifier() | <NA> | <NA> | 13.000 | 1,153.000 | <NA> | <NA> | <NA> | <NA> | 0.026 | 3.000 | 0.685 | 0.549 | 0.802 | 0.016 | 2.353 | SimpleImputer() | None | PCA('mle') | CustomOrdinalEncoder() |
| 23 | 0.714 | 0.667 | 0.761 | XGBClassifier() | <NA> | <NA> | 12.000 | 945.000 | <NA> | <NA> | <NA> | <NA> | 0.156 | 21.000 | 0.990 | 0.851 | 0.658 | 0.071 | 3.523 | SimpleImputer() | None | PCA('mle') | OneHotEncoder() |
| 24 | 0.701 | 0.669 | 0.733 | LinearSVC() | 10.021 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | MinMaxScaler() | PCA('mle') | CustomOrdinalEncoder() |
| 25 | 0.660 | 0.610 | 0.710 | LinearSVC() | 0.000 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | None | OneHotEncoder() |
# Drill into the RandomForestClassifier trials only.
rf_filter = 'model == "RandomForestClassifier()"'
results.to_formatted_dataframe(query=rf_filter, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | max_features | max_depth | n_estimators | min_samples_split | min_samples_leaf | max_samples | criterion | imputer | pca | encoder |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.767 | 0.720 | 0.814 | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | <NA> | SimpleImputer() | None | OneHotEncoder() |
| 2 | 0.756 | 0.711 | 0.802 | 0.685 | 30.000 | 1,659.000 | 25.000 | 11.000 | 0.781 | gini | SimpleImputer() | PCA('mle') | OneHotEncoder() |
| 3 | 0.753 | 0.716 | 0.791 | 0.303 | 81.000 | 1,063.000 | 15.000 | 27.000 | 0.502 | gini | SimpleImputer(strategy='median') | None | OneHotEncoder() |
| 4 | 0.751 | 0.713 | 0.789 | 0.328 | 5.000 | 1,047.000 | 23.000 | 43.000 | 0.957 | entropy | SimpleImputer(strategy='median') | PCA('mle') | OneHotEncoder() |
| 5 | 0.745 | 0.704 | 0.786 | 0.762 | 88.000 | 1,235.000 | 8.000 | 7.000 | 0.666 | gini | SimpleImputer(strategy='median') | PCA('mle') | CustomOrdinalEncoder() |
# Drill into the LogisticRegression trials only.
lr_filter = 'model == "LogisticRegression()"'
results.to_formatted_dataframe(query=lr_filter, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | pca | encoder |
|---|---|---|---|---|---|---|---|---|
| 1 | 0.763 | 0.725 | 0.802 | <NA> | SimpleImputer() | StandardScaler() | None | OneHotEncoder() |
| 2 | 0.761 | 0.697 | 0.825 | 0.001 | SimpleImputer(strategy='median') | MinMaxScaler() | None | OneHotEncoder() |
| 3 | 0.746 | 0.716 | 0.776 | 23.327 | SimpleImputer(strategy='median') | StandardScaler() | None | OneHotEncoder() |
| 4 | 0.730 | 0.702 | 0.758 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | None | CustomOrdinalEncoder() |
| 5 | 0.726 | 0.697 | 0.755 | 0.000 | SimpleImputer(strategy='median') | StandardScaler() | PCA('mle') | CustomOrdinalEncoder() |
# --- Hyper-parameter search diagnostics (figures rendered inline) ---
# Cross-validation score across all trials, faceted by model type.
results.plot_performance_across_trials(facet_by='model').show()
# Same view restricted to the RandomForestClassifier trials.
results.plot_performance_across_trials(query='model == "RandomForestClassifier()"').show()
# How each hyper-parameter value varied over the RF search trials.
results.plot_parameter_values_across_trials(query='model == "RandomForestClassifier()"').show()
# NOTE(review): intentionally disabled (large figure) — re-enable for a full
# pairwise view of the RF hyper-parameters.
# results.plot_scatter_matrix(query='model == "RandomForestClassifier()"',
# height=1000, width=1000).show()
# Score vs each numeric hyper-parameter; no explicit .show() here —
# presumably rendered via the notebook's last-expression display.
results.plot_performance_numeric_params(query='model == "RandomForestClassifier()"',
                                        height=800)
results.plot_parallel_coordinates(query='model == "RandomForestClassifier()"').show()
results.plot_performance_non_numeric_params(query='model == "RandomForestClassifier()"').show()
# Score vs max_features, with point size = max_depth and color = encoder.
results.plot_score_vs_parameter(
    query='model == "RandomForestClassifier()"',
    parameter='max_features',
    size='max_depth',
    color='encoder',
)
# NOTE(review): intentionally disabled — parameter-vs-parameter views for the
# XGBoost trials; re-enable for deeper XGBoost tuning analysis.
# results.plot_parameter_vs_parameter(
# query='model == "XGBClassifier()"',
# parameter_x='colsample_bytree',
# parameter_y='learning_rate',
# size='max_depth'
# )
# results.plot_parameter_vs_parameter(
# query='model == "XGBClassifier()"',
# parameter_x='colsample_bytree',
# parameter_y='learning_rate',
# size='imputer'
# )
# Reload the winning pipeline and the raw (untransformed) test features.
best_estimator = hlp.utility.read_pickle(file_path('experiment_best_estimator.pkl'))
x_test = pd.read_pickle(file_path('x_test.pkl'))
x_test.head()
| checking_status | duration | credit_history | purpose | credit_amount | savings_status | employment | installment_commitment | personal_status | other_parties | residence_since | property_magnitude | age | other_payment_plans | housing | existing_credits | job | num_dependents | own_telephone | foreign_worker | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 521 | <0 | 18.00 | existing paid | radio/tv | 3190.00 | <100 | 1<=X<4 | 2.00 | female div/dep/mar | none | 2.00 | real estate | 24.00 | none | own | 1.00 | skilled | 1.00 | none | yes |
| 737 | <0 | 18.00 | existing paid | new car | 4380.00 | 100<=X<500 | 1<=X<4 | 3.00 | male single | none | 4.00 | car | 35.00 | none | own | 1.00 | unskilled resident | 2.00 | yes | yes |
| 740 | <0 | 24.00 | all paid | new car | 2325.00 | 100<=X<500 | 4<=X<7 | 2.00 | male single | none | 3.00 | car | 32.00 | bank | own | 1.00 | skilled | 1.00 | none | yes |
| 660 | >=200 | 12.00 | existing paid | radio/tv | 1297.00 | <100 | 1<=X<4 | 3.00 | male mar/wid | none | 4.00 | real estate | 23.00 | none | rent | 1.00 | skilled | 1.00 | none | yes |
| 411 | no checking | 33.00 | critical/other existing credit | used car | 7253.00 | <100 | 4<=X<7 | 3.00 | male single | none | 2.00 | car | 35.00 | none | own | 2.00 | high qualif/self emp/mgmt | 1.00 | yes | yes |
# Holdout labels; peek at the first few values.
y_test = hlp.utility.read_pickle(file_path('y_test.pkl'))
y_test[:10]
array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
# Predicted probability of the positive class for each holdout row
# (column 1 of predict_proba is the positive class per sklearn's convention).
test_predictions = best_estimator.predict_proba(x_test)[:, 1]
test_predictions[:10]
array([0.402, 0.494, 0.722, 0.374, 0.056, 0.476, 0.084, 0.476, 0.18 ,
0.232])
# Evaluate the chosen model on the holdout set at a custom probability cutoff.
# NOTE(review): confirm how the 0.37 threshold was chosen (see threshold curves below).
evaluator = hlp.sklearn_eval.TwoClassEvaluator(
    actual_values=y_test,
    predicted_scores=test_predictions,
    score_threshold=0.37,
)
evaluator.plot_actual_vs_predict_histogram()
evaluator.plot_confusion_matrix()
# Full metric table, benchmarked against two dummy-classifier baselines.
evaluator.all_metrics_df(
    return_style=True,
    dummy_classifier_strategy=['prior', 'constant'],
    round_by=3,
)
| Score | Dummy (prior) | Dummy (constant) | Explanation | |
|---|---|---|---|---|
| AUC | 0.825 | 0.500 | 0.500 | Area under the ROC curve (true pos. rate vs false pos. rate); ranges from 0.5 (purely random classifier) to 1.0 (perfect classifier) |
| True Positive Rate | 0.746 | 0.000 | 1.000 | 74.6% of positive instances were correctly identified.; i.e. 44 "Positive Class" labels were correctly identified out of 59 instances; a.k.a Sensitivity/Recall |
| True Negative Rate | 0.801 | 1.000 | 0.000 | 80.1% of negative instances were correctly identified.; i.e. 113 "Negative Class" labels were correctly identified out of 141 instances |
| False Positive Rate | 0.199 | 0.000 | 1.000 | 19.9% of negative instances were incorrectly identified as positive; i.e. 28 "Negative Class" labels were incorrectly identified as "Positive Class", out of 141 instances |
| False Negative Rate | 0.254 | 1.000 | 0.000 | 25.4% of positive instances were incorrectly identified as negative; i.e. 15 "Positive Class" labels were incorrectly identified as "Negative Class", out of 59 instances |
| Positive Predictive Value | 0.611 | 0.000 | 0.295 | When the model claims an instance is positive, it is correct 61.1% of the time; i.e. out of the 72 times the model predicted "Positive Class", it was correct 44 times; a.k.a precision |
| Negative Predictive Value | 0.883 | 0.705 | 0.000 | When the model claims an instance is negative, it is correct 88.3% of the time; i.e. out of the 128 times the model predicted "Negative Class", it was correct 113 times |
| F1 Score | 0.672 | 0.000 | 0.456 | The F1 score can be interpreted as a weighted average of the precision and recall, where an F1 score reaches its best value at 1 and worst score at 0. |
| Precision/Recall AUC | 0.667 | 0.295 | 0.295 | Precision/Recall AUC is calculated with `average_precision` which summarizes a precision-recall curve as the weighted mean of precisions achieved at each threshold. See sci-kit learn documentation for caveats. |
| Accuracy | 0.785 | 0.705 | 0.295 | 78.5% of instances were correctly identified |
| Error Rate | 0.215 | 0.295 | 0.705 | 21.5% of instances were incorrectly identified |
| % Positive | 0.295 | 0.295 | 0.295 | 29.5% of the data are positive; i.e. out of 200 total observations; 59 are labeled as "Positive Class" |
| Total Observations | 200 | 200 | 200 | There are 200 total observations; i.e. sample size |
# ROC curve for the holdout predictions.
evaluator.plot_roc_auc_curve().show()
<Figure size 720x444.984 with 0 Axes>
# Precision/recall curve for the holdout predictions.
evaluator.plot_precision_recall_auc_curve().show()
# Metric values across candidate score thresholds (context for the 0.37 cutoff).
evaluator.plot_threshold_curves(score_threshold_range=(0.1, 0.7)).show()
# Precision-vs-recall trade-off across thresholds.
evaluator.plot_precision_recall_tradeoff(score_threshold_range=(0.1, 0.6)).show()
# Cumulative gain and lift by score percentile, styled for display.
evaluator.calculate_lift_gain(return_style=True)
| Gain | Lift | |
|---|---|---|
| Percentile | ||
| 5 | 0.14 | 2.71 |
| 10 | 0.24 | 2.37 |
| 15 | 0.37 | 2.49 |
| 20 | 0.49 | 2.46 |
| 25 | 0.56 | 2.24 |
| 30 | 0.64 | 2.15 |
| 35 | 0.75 | 2.13 |
| 40 | 0.76 | 1.91 |
| 45 | 0.80 | 1.77 |
| 50 | 0.83 | 1.66 |
| 55 | 0.85 | 1.54 |
| 60 | 0.88 | 1.47 |
| 65 | 0.88 | 1.36 |
| 70 | 0.93 | 1.33 |
| 75 | 0.95 | 1.27 |
| 80 | 0.97 | 1.21 |
| 85 | 0.98 | 1.16 |
| 90 | 1.00 | 1.11 |
| 95 | 1.00 | 1.05 |
| 100 | 1.00 | 1.00 |